En este notebook estaré resolviendo los puntos del ejercicio tecnico.
Primero, importaré las librerias y cargaré los datos.
#Primero, cargamos las librerias
import pandas as pd
import numpy as np
import plotly.express as px
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Ahora, declaro las primeras variables de entorno con los paths para cargar la data
path = "/home/kevin/Desktop/python_files/varios/interview-quilmes/"
clientes = "clientes.csv"
desafios_mes = "desafios_mes.csv"
puntos_mes = "puntos_mes.csv"
transacciones_mes = "transacciones_mes.csv"
# Cargo, con pandas, los distintos datos como dataframes
df_clientes = pd.read_csv(path+clientes)
df_transacciones_mes = pd.read_csv(path+transacciones_mes)
df_desafios_mes = pd.read_csv(path+desafios_mes)
df_puntos_mes = pd.read_csv(path+puntos_mes)
Los dataframes son los siguientes:
df_clientes
| cliente_id | canal_desc | subchannel_desc | bees_start | rewards_start | |
|---|---|---|---|---|---|
| 0 | 119251 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 1 | 181737 | Tradicional | Almacen | 2018-11 | 2021-01 |
| 2 | 181871 | Tradicional | Almacen | 2018-11 | 2021-03 |
| 3 | 183637 | Kioscos/maxikioscos | Kiosco/maxikiosco | 2021-03 | 2021-04 |
| 4 | 184485 | Tradicional | Almacen | 2021-03 | 2021-03 |
| ... | ... | ... | ... | ... | ... |
| 1163 | 1383431 | Kioscos/maxikioscos | Kiosco Ventana | 2022-09 | 2022-09 |
| 1164 | 1383525 | Tradicional | Almacen | 2021-03 | 2021-03 |
| 1165 | 1383573 | Tradicional | Almacen | 2021-04 | 2021-09 |
| 1166 | 1383761 | Tradicional | Almacen | 2021-03 | 2021-03 |
| 1167 | 1392101 | Tradicional | Almacen | 2021-03 | 2021-04 |
1168 rows × 5 columns
df_puntos_mes
| cliente_id | mes | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | |
|---|---|---|---|---|---|---|
| 0 | 202975.0 | 2022-08 | 0.0 | 1935.0 | 1 | 2124.0 |
| 1 | 202975.0 | 2022-09 | 0.0 | 0.0 | 0 | 2124.0 |
| 2 | 202975.0 | 2022-10 | 0.0 | 2685.0 | 1 | 3216.0 |
| 3 | 202975.0 | 2022-11 | 373.0 | 3745.0 | 1 | 160.0 |
| 4 | 202975.0 | 2022-12 | 481.0 | 0.0 | 0 | 928.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 16165 | 199337.0 | 2023-06 | 0.0 | 0.0 | 0 | 7716.0 |
| 16166 | 199337.0 | 2023-07 | 0.0 | 0.0 | 0 | 8292.0 |
| 16167 | 199337.0 | 2023-08 | 0.0 | 0.0 | 0 | 8366.0 |
| 16168 | 199337.0 | 2023-09 | 0.0 | 0.0 | 0 | 8453.0 |
| 16169 | 199337.0 | 2023-10 | 0.0 | 0.0 | 0 | 8453.0 |
16170 rows × 6 columns
df_desafios_mes
| cliente_id | mes | asignados | vistos | aceptados | completados | |
|---|---|---|---|---|---|---|
| 0 | 202975.0 | 2022-08 | 1 | 0 | 0 | 0 |
| 1 | 202975.0 | 2022-10 | 1 | 1 | 1 | 0 |
| 2 | 202975.0 | 2022-11 | 3 | 0 | 0 | 0 |
| 3 | 202975.0 | 2022-12 | 1 | 0 | 0 | 0 |
| 4 | 202975.0 | 2023-01 | 1 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 13137 | 199337.0 | 2023-05 | 1 | 0 | 0 | 0 |
| 13138 | 199337.0 | 2023-06 | 1 | 0 | 0 | 0 |
| 13139 | 199337.0 | 2023-07 | 1 | 0 | 0 | 0 |
| 13140 | 199337.0 | 2023-08 | 1 | 0 | 0 | 0 |
| 13141 | 199337.0 | 2023-09 | 1 | 0 | 0 | 0 |
13142 rows × 6 columns
df_transacciones_mes
| cliente_id | mes | transacciones | volumen | bultos | productos_unicos | marcas_unicas | mix_latas | mix_retornable | mix_premium | bultos_por_transaccion | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 119251 | 2022-11 | 4 | 3.14 | 37.0 | 24 | 9 | 47.74 | 21.45 | 53.71 | 9.25 |
| 1 | 119251 | 2022-12 | 5 | 2.04 | 26.0 | 12 | 7 | 46.36 | 21.93 | 41.83 | 5.20 |
| 2 | 119251 | 2023-01 | 7 | 6.42 | 82.0 | 20 | 8 | 73.99 | 11.97 | 31.09 | 11.71 |
| 3 | 119251 | 2023-02 | 4 | 5.48 | 57.0 | 18 | 9 | 79.18 | 13.82 | 11.49 | 14.25 |
| 4 | 119251 | 2023-03 | 5 | 4.07 | 40.0 | 18 | 7 | 86.11 | 11.77 | 13.08 | 8.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11128 | 1392101 | 2023-06 | 2 | 1.06 | 12.0 | 4 | 4 | 15.44 | 68.44 | 38.25 | 6.00 |
| 11129 | 1392101 | 2023-07 | 4 | 2.39 | 26.0 | 6 | 4 | 40.44 | 55.99 | 8.47 | 6.50 |
| 11130 | 1392101 | 2023-08 | 3 | 1.04 | 12.0 | 5 | 5 | 2.73 | 97.27 | 11.27 | 4.00 |
| 11131 | 1392101 | 2023-09 | 3 | 2.15 | 28.0 | 13 | 7 | 35.62 | 56.46 | 10.56 | 9.33 |
| 11132 | 1392101 | 2023-10 | 4 | 1.76 | 19.0 | 15 | 8 | 33.57 | 56.77 | 24.52 | 4.75 |
11133 rows × 11 columns
#Para tener nocion de los valores de los distintos dataframe, corremos el siguiente bloque de codigo
len(df_clientes["cliente_id"].unique()),len(df_transacciones_mes["cliente_id"].unique()),len(df_desafios_mes["cliente_id"].unique()),len(df_puntos_mes["cliente_id"].unique())
(1168, 1125, 1078, 1078)
df_clientes.dtypes
cliente_id int64 canal_desc object subchannel_desc object bees_start object rewards_start object dtype: object
Integrar los datos de las distintas fuentes y construir las muestras de desarrollo y validación para el modelo de clasificación; para, en última instancia, generar los desafíos del mes siguiente (al más reciente disponible para desafíos).
# Convertir "cliente_id" a int64 en todos los DataFrames
df_clientes['cliente_id'] = df_clientes['cliente_id'].astype('int64')
df_desafios_mes['cliente_id'] = df_desafios_mes['cliente_id'].astype('int64')
df_puntos_mes['cliente_id'] = df_puntos_mes['cliente_id'].astype('int64')
df_transacciones_mes['cliente_id'] = df_transacciones_mes['cliente_id'].astype('int64')
# Luego, procede con la fusión de datos como lo hicimos anteriormente
# chequeamos los clientes
clients_set=set(df_clientes["cliente_id"].unique())
desafios_set=set(df_desafios_mes["cliente_id"].unique())
puntos_set=set(df_puntos_mes["cliente_id"].unique())
transacciones_set=set(df_transacciones_mes["cliente_id"].unique())
transacciones_set.difference(puntos_set)
{794735,
926493,
927149,
928163,
929083,
929827,
930227,
930561,
931061,
931347,
931619,
931963,
932161,
932531,
932783,
933687,
933727,
934129,
934237,
934457,
934459,
934523,
934855,
935041,
935253,
935483,
935527,
935623,
935693,
935961,
936069,
936131,
936199,
936241,
936269,
936293,
936531,
936721,
936837,
936949,
936989,
937029,
937071,
937073,
937213,
937215,
937359,
937537,
937541,
937567,
937591,
937653,
937697,
938003,
938185,
938805,
938899,
939021,
939309,
1375927,
1376219}
# estudiamos un poco cada dataframe
df_puntos_mes.sort_values("mes")["mes"].unique()
array(['2022-08', '2022-09', '2022-10', '2022-11', '2022-12', '2023-01',
'2023-02', '2023-03', '2023-04', '2023-05', '2023-06', '2023-07',
'2023-08', '2023-09', '2023-10'], dtype=object)
# Ahora combinamos los datos, comenzando por desafios_mes, puntos_mes y transacciones y luego df_clientes
first_merge=df_desafios_mes.merge(df_puntos_mes,on=["cliente_id","mes"],how="outer")
second_merge=first_merge.merge(df_transacciones_mes,on=["cliente_id","mes"],how="outer")
df=second_merge.merge(df_clientes,on="cliente_id",how="left")
df
| cliente_id | mes | asignados | vistos | aceptados | completados | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | ... | productos_unicos | marcas_unicas | mix_latas | mix_retornable | mix_premium | bultos_por_transaccion | canal_desc | subchannel_desc | bees_start | rewards_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202975 | 2022-08 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1935.0 | 1.0 | 2124.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 |
| 1 | 202975 | 2022-10 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 2685.0 | 1.0 | 3216.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 |
| 2 | 202975 | 2022-11 | 3.0 | 0.0 | 0.0 | 0.0 | 373.0 | 3745.0 | 1.0 | 160.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 |
| 3 | 202975 | 2022-12 | 1.0 | 0.0 | 0.0 | 0.0 | 481.0 | 0.0 | 0.0 | 928.0 | ... | 4.0 | 4.0 | 24.70 | 75.30 | 0.0 | 13.00 | Tradicional | Almacen | 2021-03 | 2021-03 |
| 4 | 202975 | 2023-01 | 1.0 | 1.0 | 1.0 | 0.0 | 3679.0 | 0.0 | 0.0 | 5039.0 | ... | 3.0 | 3.0 | 0.00 | 100.00 | 0.0 | 7.00 | Tradicional | Almacen | 2021-03 | 2021-03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16359 | 1376219 | 2023-06 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 6.0 | 2.0 | 5.17 | 42.66 | 100.0 | 1.75 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16360 | 1376219 | 2023-07 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.0 | 1.0 | 0.00 | 100.00 | 100.0 | 1.00 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16361 | 1376219 | 2023-08 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 2.0 | 0.00 | 41.58 | 100.0 | 1.50 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16362 | 1376219 | 2023-09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 2.0 | 76.43 | 0.00 | 100.0 | 3.00 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16363 | 1376219 | 2023-10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 2.0 | 0.00 | 58.30 | 100.0 | 1.67 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
16364 rows × 23 columns
Una vez hecho el merge, chequeamos cuantos valores quedaron nulos.
dff=df.copy()
for i in dff.columns:
print(f"{i}: {len(dff[dff[i].isnull()])}")
cliente_id: 0 mes: 0 asignados: 3222 vistos: 3222 aceptados: 3222 completados: 3222 puntos_ganados_total: 194 puntos_canjeados_total: 194 cantidad_canjes: 194 puntos_saldo_fin_mes: 194 transacciones: 5231 volumen: 5231 bultos: 5231 productos_unicos: 5231 marcas_unicas: 5231 mix_latas: 5231 mix_retornable: 5231 mix_premium: 5231 bultos_por_transaccion: 5231 canal_desc: 0 subchannel_desc: 0 bees_start: 0 rewards_start: 0
nan_checker=first_merge.copy()
for i in nan_checker.columns:
print(f"{i}: {len(nan_checker[nan_checker[i].isnull()])}")
cliente_id: 0 mes: 0 asignados: 3028 vistos: 3028 aceptados: 3028 completados: 3028 puntos_ganados_total: 0 puntos_canjeados_total: 0 cantidad_canjes: 0 puntos_saldo_fin_mes: 0
df[df.isnull().any(axis=1)]
| cliente_id | mes | asignados | vistos | aceptados | completados | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | ... | productos_unicos | marcas_unicas | mix_latas | mix_retornable | mix_premium | bultos_por_transaccion | canal_desc | subchannel_desc | bees_start | rewards_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202975 | 2022-08 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1935.0 | 1.0 | 2124.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 |
| 1 | 202975 | 2022-10 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 2685.0 | 1.0 | 3216.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 |
| 2 | 202975 | 2022-11 | 3.0 | 0.0 | 0.0 | 0.0 | 373.0 | 3745.0 | 1.0 | 160.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 |
| 9 | 202975 | 2023-06 | 4.0 | 3.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2451.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 |
| 14 | 203693 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 1295.0 | 1568.0 | 1.0 | 1321.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Kioscos/maxikioscos | Kiosco/maxikiosco | 2021-03 | 2021-04 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16359 | 1376219 | 2023-06 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 6.0 | 2.0 | 5.17 | 42.66 | 100.0 | 1.75 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16360 | 1376219 | 2023-07 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.0 | 1.0 | 0.00 | 100.00 | 100.0 | 1.00 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16361 | 1376219 | 2023-08 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 2.0 | 0.00 | 41.58 | 100.0 | 1.50 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16362 | 1376219 | 2023-09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 2.0 | 76.43 | 0.00 | 100.0 | 3.00 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
| 16363 | 1376219 | 2023-10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 2.0 | 0.00 | 58.30 | 100.0 | 1.67 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 |
6898 rows × 23 columns
#ordenamos los valores por mes para que sea mas comodo de ver
df=df.sort_values("mes")
Previo a buscar un modelo, es importante explorar visualmente los datos para tener el primer acercamiento a los mismos.
df.columns
Index(['cliente_id', 'mes', 'asignados', 'vistos', 'aceptados', 'completados',
'puntos_ganados_total', 'puntos_canjeados_total', 'cantidad_canjes',
'puntos_saldo_fin_mes', 'transacciones', 'volumen', 'bultos',
'productos_unicos', 'marcas_unicas', 'mix_latas', 'mix_retornable',
'mix_premium', 'bultos_por_transaccion', 'canal_desc',
'subchannel_desc', 'bees_start', 'rewards_start'],
dtype='object')
En el siguiente grafico observamos la gran diferencia entre desafios asignados y desafios completados, dejando una gran brecha la cual explica el por qué de querer correr un modelo de clasificacion para optimizar la cantidad de desafios enviados. Por otro lado, los vistos y aceptados se comportan practicamente igual a lo largo de la serie.
px.line(df.groupby("mes",as_index=False).sum(),x="mes",y=["asignados","vistos", "aceptados", "completados"]).show()
/tmp/ipykernel_25267/3676538993.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
px.line(df.groupby("mes",as_index=False).sum(),x="mes",y=["asignados","vistos", "aceptados", "completados"]).show()
df[df["completados"]==1].vistos.unique()
array([1., 0., 2., 3., 4., 5.])
Un detalle interesante de este dataframe, es que hay desafios completados pero que no fueron vistos. Pasaron directamente de asignados a ser aceptados.
df_desafios_mes.loc[df_desafios_mes["completados"]==1].loc[df["vistos"]==0]
#df_desafios_mes.loc[df_desafios_mes["completados"]==1].loc[df["aceptados"]==0]
| cliente_id | mes | asignados | vistos | aceptados | completados | |
|---|---|---|---|---|---|---|
| 1351 | 378299 | 2023-07 | 3 | 0 | 3 | 1 |
| 1352 | 378299 | 2023-08 | 3 | 0 | 3 | 1 |
| 3692 | 777915 | 2023-07 | 3 | 0 | 2 | 1 |
| 3694 | 777915 | 2023-09 | 2 | 0 | 2 | 1 |
| 5322 | 837371 | 2023-04 | 2 | 0 | 2 | 1 |
| 5412 | 840857 | 2022-08 | 1 | 0 | 1 | 1 |
| 5644 | 844225 | 2022-08 | 1 | 0 | 1 | 1 |
| 5658 | 844227 | 2022-08 | 1 | 0 | 1 | 1 |
| 5804 | 850561 | 2022-08 | 1 | 0 | 1 | 1 |
| 5841 | 851459 | 2022-08 | 1 | 0 | 1 | 1 |
| 6048 | 855373 | 2023-03 | 1 | 0 | 1 | 1 |
| 6429 | 859677 | 2023-01 | 1 | 0 | 1 | 1 |
| 6430 | 859677 | 2023-02 | 1 | 0 | 1 | 1 |
| 6431 | 859677 | 2023-04 | 1 | 0 | 1 | 1 |
| 6436 | 859677 | 2023-09 | 3 | 0 | 3 | 1 |
| 7408 | 874513 | 2023-08 | 3 | 0 | 3 | 1 |
| 7847 | 878043 | 2023-04 | 2 | 0 | 2 | 1 |
| 7853 | 878043 | 2023-10 | 1 | 0 | 1 | 1 |
| 8056 | 882181 | 2022-08 | 1 | 0 | 1 | 1 |
| 9031 | 893873 | 2023-10 | 2 | 0 | 2 | 1 |
| 9230 | 897225 | 2023-04 | 1 | 0 | 1 | 1 |
| 9232 | 897225 | 2023-06 | 4 | 0 | 4 | 1 |
| 9233 | 897225 | 2023-07 | 3 | 0 | 3 | 1 |
| 9234 | 897225 | 2023-08 | 3 | 0 | 3 | 1 |
| 11306 | 924963 | 2023-10 | 2 | 0 | 2 | 1 |
Como observamos en el siguiente grafico, son mayores los clientes que completan desafios que se les envia 2 veces en lugar de solo una (en proporcion)
fig = px.parallel_categories(df, dimensions=["asignados","vistos","completados"],color="completados"
)
fig.show()
df_puntos_mes
| cliente_id | mes | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | |
|---|---|---|---|---|---|---|
| 0 | 202975 | 2022-08 | 0.0 | 1935.0 | 1 | 2124.0 |
| 1 | 202975 | 2022-09 | 0.0 | 0.0 | 0 | 2124.0 |
| 2 | 202975 | 2022-10 | 0.0 | 2685.0 | 1 | 3216.0 |
| 3 | 202975 | 2022-11 | 373.0 | 3745.0 | 1 | 160.0 |
| 4 | 202975 | 2022-12 | 481.0 | 0.0 | 0 | 928.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 16165 | 199337 | 2023-06 | 0.0 | 0.0 | 0 | 7716.0 |
| 16166 | 199337 | 2023-07 | 0.0 | 0.0 | 0 | 8292.0 |
| 16167 | 199337 | 2023-08 | 0.0 | 0.0 | 0 | 8366.0 |
| 16168 | 199337 | 2023-09 | 0.0 | 0.0 | 0 | 8453.0 |
| 16169 | 199337 | 2023-10 | 0.0 | 0.0 | 0 | 8453.0 |
16170 rows × 6 columns
for id,id_data in df_puntos_mes.groupby("cliente_id"):
display(id_data)
break
| cliente_id | mes | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | |
|---|---|---|---|---|---|---|
| 14235 | 119251 | 2022-08 | 1912.0 | 0.0 | 0 | 87445.0 |
| 14236 | 119251 | 2022-09 | 0.0 | 0.0 | 0 | 87445.0 |
| 14237 | 119251 | 2022-10 | 0.0 | 0.0 | 0 | 92277.0 |
| 14238 | 119251 | 2022-11 | 187.0 | 0.0 | 0 | 96522.0 |
| 14239 | 119251 | 2022-12 | 0.0 | 0.0 | 0 | 96522.0 |
| 14240 | 119251 | 2023-01 | 4200.0 | 0.0 | 0 | 112513.0 |
| 14241 | 119251 | 2023-02 | 838.0 | 0.0 | 0 | 119985.0 |
| 14242 | 119251 | 2023-03 | 0.0 | 0.0 | 0 | 119985.0 |
| 14243 | 119251 | 2023-04 | 0.0 | 0.0 | 0 | 127227.0 |
| 14244 | 119251 | 2023-05 | 2176.0 | 0.0 | 0 | 130271.0 |
| 14245 | 119251 | 2023-06 | 3517.0 | 0.0 | 0 | 135909.0 |
| 14246 | 119251 | 2023-07 | 8008.0 | 0.0 | 0 | 144445.0 |
| 14247 | 119251 | 2023-08 | 3142.0 | 138915.0 | 1 | 36873.0 |
| 14248 | 119251 | 2023-09 | 3876.0 | 0.0 | 0 | 42694.0 |
| 14249 | 119251 | 2023-10 | 3621.0 | 0.0 | 0 | 47971.0 |
lens=[]
for id,id_data in df_puntos_mes.groupby("cliente_id"):
lens.append(len(id_data.loc[id_data["cantidad_canjes"]==1]))
px.histogram(lens)
A continuacion observamos que hay algunos pocos clientes que poseen los puntos mas altos.
px.line(df,x="mes",y="puntos_saldo_fin_mes",color="cliente_id").show()
px.line(df.groupby("mes",as_index=False).sum(),x="mes",y="puntos_saldo_fin_mes").show()
/tmp/ipykernel_25267/1925604108.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
for id,id_data in df.groupby("cliente_id"):
display(id_data)
break
| cliente_id | mes | asignados | vistos | aceptados | completados | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | ... | productos_unicos | marcas_unicas | mix_latas | mix_retornable | mix_premium | bultos_por_transaccion | canal_desc | subchannel_desc | bees_start | rewards_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11444 | 119251 | 2022-08 | 1.0 | 1.0 | 1.0 | 1.0 | 1912.0 | 0.0 | 0.0 | 87445.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 15933 | 119251 | 2022-09 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 87445.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11445 | 119251 | 2022-10 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 92277.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11446 | 119251 | 2022-11 | 3.0 | 1.0 | 1.0 | 0.0 | 187.0 | 0.0 | 0.0 | 96522.0 | ... | 24.0 | 9.0 | 47.74 | 21.45 | 53.71 | 9.25 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 15934 | 119251 | 2022-12 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 96522.0 | ... | 12.0 | 7.0 | 46.36 | 21.93 | 41.83 | 5.20 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11447 | 119251 | 2023-01 | 1.0 | 1.0 | 1.0 | 1.0 | 4200.0 | 0.0 | 0.0 | 112513.0 | ... | 20.0 | 8.0 | 73.99 | 11.97 | 31.09 | 11.71 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11448 | 119251 | 2023-02 | 1.0 | 1.0 | 1.0 | 0.0 | 838.0 | 0.0 | 0.0 | 119985.0 | ... | 18.0 | 9.0 | 79.18 | 13.82 | 11.49 | 14.25 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 15935 | 119251 | 2023-03 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 119985.0 | ... | 18.0 | 7.0 | 86.11 | 11.77 | 13.08 | 8.00 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11449 | 119251 | 2023-04 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 127227.0 | ... | 9.0 | 7.0 | 52.60 | 18.90 | 28.50 | 5.00 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11450 | 119251 | 2023-05 | 1.0 | 1.0 | 1.0 | 0.0 | 2176.0 | 0.0 | 0.0 | 130271.0 | ... | 10.0 | 6.0 | 41.18 | 46.60 | 16.18 | 12.50 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11451 | 119251 | 2023-06 | 3.0 | 3.0 | 3.0 | 1.0 | 3517.0 | 0.0 | 0.0 | 135909.0 | ... | 11.0 | 7.0 | 57.50 | 22.12 | 17.28 | 6.33 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11452 | 119251 | 2023-07 | 3.0 | 3.0 | 3.0 | 0.0 | 8008.0 | 0.0 | 0.0 | 144445.0 | ... | 4.0 | 4.0 | 56.49 | 43.51 | 0.00 | 2.00 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11453 | 119251 | 2023-08 | 3.0 | 3.0 | 3.0 | 1.0 | 3142.0 | 138915.0 | 1.0 | 36873.0 | ... | 6.0 | 5.0 | 81.63 | 0.00 | 14.48 | 2.75 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11454 | 119251 | 2023-09 | 2.0 | 2.0 | 2.0 | 0.0 | 3876.0 | 0.0 | 0.0 | 42694.0 | ... | 24.0 | 11.0 | 52.59 | 43.15 | 15.05 | 11.00 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
| 11455 | 119251 | 2023-10 | 2.0 | 2.0 | 2.0 | 2.0 | 3621.0 | 0.0 | 0.0 | 47971.0 | ... | 13.0 | 8.0 | 55.03 | 44.97 | 8.13 | 5.25 | Autoservicio | Autoservicio Independiente | 2018-11 | 2021-05 |
15 rows × 23 columns
#puntos_ganados_total puntos_canjeados_total"
px.line(df.groupby("mes",as_index=False).sum(),x="mes",y=["puntos_ganados_total", "puntos_canjeados_total","puntos_saldo_fin_mes"]).show()
# px.line(df.sort_values("mes"),x="mes",y="puntos_ganados_total",color="cliente_id").show()
/tmp/ipykernel_25267/518665167.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
Notese que utilizamos la libreria numpy para pasar a lograritmo natural aquellas variables que tienen pocos valores muy altos y muchos valores pequeños. De esta manera, es mas facil de visualizarlo y trabajar con estos datos.
Se entiende, tambien, por qué se utilizan los principales canales (primera imagen) y se desglosan en la segunda. Estos, representan a la mayoria del volumen.
df["ln_volumen"]=np.log(df["volumen"])
px.histogram(df,x="ln_volumen",color="canal_desc").show()
px.histogram(df,x="ln_volumen",color="subchannel_desc").show()
Clasificacion de cada canal.
fig = px.treemap(df, path=["canal_desc", 'subchannel_desc', ], values='volumen',
color='subchannel_desc')
fig.show()
Otro dato interesante es que febrero, marzo y abril de 2021 se dieron de alta la mayoria de los clientes al programa de fidelizacion y a la plataforma.
px.histogram(df,x="bees_start",color="canal_desc").show()
px.histogram(df,x="rewards_start",color="canal_desc").show()
fig = px.treemap(df, path=["bees_start", 'rewards_start', ],
color='subchannel_desc')
fig.show()
Luego, observamos que las transacciones pueden pensarse como variables dummy, ya que algunos clientes utilizan envases de latas y otros no (en vez de verlo como porcentaje, se puede ver como True o False).
df["ln_volumen"]=np.log(df["volumen"])
df["ln_bultos_por_transaccion"]=np.log(df["bultos_por_transaccion"]) #Transformacion
var="mix_latas"
px.histogram(df,x=var,color="canal_desc").show()
px.histogram(df,x=var,color="subchannel_desc").show()
Objetivos
Notese que antes de transformar los datos, siempre usamos un backup.
#df_back=df.copy()
Previamente se mencionó que febero, marzo y abril tenian casi todo el volumen de datos, por lo que se procederá a transformarla en dummy:
df["start_feb_mar_ab"]=0
df.loc[df["rewards_start"].isin(["2021-02","2021-03","2021-04"]),"start_feb_mar_ab"]=1
df
| cliente_id | mes | asignados | vistos | aceptados | completados | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | ... | mix_retornable | mix_premium | bultos_por_transaccion | canal_desc | subchannel_desc | bees_start | rewards_start | ln_volumen | ln_bultos_por_transaccion | start_feb_mar_ab | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202975 | 2022-08 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1935.0 | 1.0 | 2124.0 | ... | NaN | NaN | NaN | Tradicional | Almacen | 2021-03 | 2021-03 | NaN | NaN | 1 |
| 2058 | 663511 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 1631.0 | ... | NaN | NaN | NaN | Kioscos/maxikioscos | Kiosco/maxikiosco | 2021-04 | 2021-04 | NaN | NaN | 1 |
| 9918 | 907959 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 26331.0 | 1.0 | 1832.0 | ... | NaN | NaN | NaN | Kioscos/maxikioscos | Kiosco/maxikiosco | 2022-01 | 2022-01 | NaN | NaN | 0 |
| 9936 | 908499 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 16673.0 | ... | NaN | NaN | NaN | Kioscos/maxikioscos | Kiosco/maxikiosco | 2021-12 | 2021-12 | NaN | NaN | 0 |
| 9942 | 908517 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 88000.0 | 1.0 | 19552.0 | ... | NaN | NaN | NaN | Tradicional | Almacen | 2021-12 | 2021-12 | NaN | NaN | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11490 | 1364333 | 2023-10 | 2.0 | 2.0 | 2.0 | 0.0 | 832.0 | 10030.0 | 1.0 | 2022.0 | ... | 60.27 | 13.81 | 4.00 | Tradicional | Almacen | 2021-03 | 2021-03 | -0.127833 | 1.386294 | 1 |
| 11478 | 1364321 | 2023-10 | 1.0 | 0.0 | 0.0 | 0.0 | 750.0 | 0.0 | 0.0 | 2157.0 | ... | 56.62 | 68.27 | 2.58 | Kioscos/maxikioscos | Kiosco Ventana | 2021-03 | 2021-03 | -1.427116 | 0.947789 | 1 |
| 1614 | 622797 | 2023-10 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1553.0 | ... | 100.00 | 0.00 | 1.00 | Tradicional | Almacen | 2021-04 | 2021-04 | -2.120264 | 0.000000 | 1 |
| 11346 | 926101 | 2023-10 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4198.0 | ... | 36.68 | 17.46 | 8.17 | Kioscos/maxikioscos | Kiosco Ventana | 2022-11 | 2022-11 | 1.184790 | 2.100469 | 0 |
| 16363 | 1376219 | 2023-10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 58.30 | 100.00 | 1.67 | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 | -0.916291 | 0.512824 | 0 |
16364 rows × 26 columns
Tambien se mencionó que las latas (asi como retornables y premium) seguian una distribucion tendiendo a binaria, por lo que aplicamos funciones lambda para convertirlas en categoricas.
df["latas_cat"]=df["mix_latas"].apply(lambda x: str(x) if (x==0) or (x==100) else "rest")
df["retornable_cat"]=df["mix_retornable"].apply(lambda x: str(x) if (x==0) or (x==100) else "rest")
df["premium_cat"]=df["mix_premium"].apply(lambda x: str(x) if (x==0) or (x==100) else "rest")
df
| cliente_id | mes | asignados | vistos | aceptados | completados | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | ... | canal_desc | subchannel_desc | bees_start | rewards_start | ln_volumen | ln_bultos_por_transaccion | start_feb_mar_ab | latas_cat | retornable_cat | premium_cat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202975 | 2022-08 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1935.0 | 1.0 | 2124.0 | ... | Tradicional | Almacen | 2021-03 | 2021-03 | NaN | NaN | 1 | rest | rest | rest |
| 2058 | 663511 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 1631.0 | ... | Kioscos/maxikioscos | Kiosco/maxikiosco | 2021-04 | 2021-04 | NaN | NaN | 1 | rest | rest | rest |
| 9918 | 907959 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 26331.0 | 1.0 | 1832.0 | ... | Kioscos/maxikioscos | Kiosco/maxikiosco | 2022-01 | 2022-01 | NaN | NaN | 0 | rest | rest | rest |
| 9936 | 908499 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 16673.0 | ... | Kioscos/maxikioscos | Kiosco/maxikiosco | 2021-12 | 2021-12 | NaN | NaN | 0 | rest | rest | rest |
| 9942 | 908517 | 2022-08 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 88000.0 | 1.0 | 19552.0 | ... | Tradicional | Almacen | 2021-12 | 2021-12 | NaN | NaN | 0 | rest | rest | rest |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11490 | 1364333 | 2023-10 | 2.0 | 2.0 | 2.0 | 0.0 | 832.0 | 10030.0 | 1.0 | 2022.0 | ... | Tradicional | Almacen | 2021-03 | 2021-03 | -0.127833 | 1.386294 | 1 | rest | rest | rest |
| 11478 | 1364321 | 2023-10 | 1.0 | 0.0 | 0.0 | 0.0 | 750.0 | 0.0 | 0.0 | 2157.0 | ... | Kioscos/maxikioscos | Kiosco Ventana | 2021-03 | 2021-03 | -1.427116 | 0.947789 | 1 | rest | rest | rest |
| 1614 | 622797 | 2023-10 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1553.0 | ... | Tradicional | Almacen | 2021-04 | 2021-04 | -2.120264 | 0.000000 | 1 | 0.0 | 100.0 | 0.0 |
| 11346 | 926101 | 2023-10 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4198.0 | ... | Kioscos/maxikioscos | Kiosco Ventana | 2022-11 | 2022-11 | 1.184790 | 2.100469 | 0 | rest | rest | rest |
| 16363 | 1376219 | 2023-10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | Tradicional | Panaderia / Pasteleria | 2023-06 | 2023-06 | -0.916291 | 0.512824 | 0 | 0.0 | rest | 100.0 |
16364 rows × 29 columns
Transformamos mas variables a categoricas:
df = pd.get_dummies(df, columns=['canal_desc'], drop_first=True)
cat_cols = ['subchannel_desc', 'latas_cat', 'retornable_cat', 'premium_cat']
#'subchannel_desc_Almacen Autoservice',
# 'subchannel_desc_Autoservicio Independiente',
# 'subchannel_desc_Carniceria/polleria', 'subchannel_desc_Dietetica',
# 'subchannel_desc_Farmacia', 'subchannel_desc_Fiambreria/queseria',
# 'subchannel_desc_Kiosco Ventana', 'subchannel_desc_Kiosco/maxikiosco',
# 'subchannel_desc_Otros K+t', 'subchannel_desc_Panaderia / Pasteleria',
# 'subchannel_desc_Rotiseria/com P Llevar',
# 'subchannel_desc_Tienda De Bebidas', 'subchannel_desc_Verduleria',
# 'subchannel_desc_Vinoteca', 'latas_cat_100.0', 'latas_cat_rest',
# 'retornable_cat_100.0', 'retornable_cat_rest', 'premium_cat_100.0',
# 'premium_cat_rest'
# Aplicar one-ht encoding
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
Cambiamos el formato de la fecha porque nos tira error:
#distintas soluciones que fui probando
# df['mes'] = pd.to_datetime(df['mes'], format='%Y-%m')
# df['bees_start'] = pd.to_datetime(df['bees_start'], format='%Y-%m')
# df['rewards_start'] = pd.to_datetime(df['rewards_start'], format='%Y-%m')
df['mes'] = pd.to_datetime(df['mes']).astype('int64')
df['bees_start'] = pd.to_datetime(df['bees_start']).astype('int64')
df['rewards_start'] = pd.to_datetime(df['rewards_start']).astype('int64')
# df['mes'] = df['mes'].apply(lambda x: x.timestamp())
# df['bees_start'] = df['bees_start'].apply(lambda x: x.timestamp())
# df['rewards_start'] = df['rewards_start'].apply(lambda x: x.timestamp())
df
| cliente_id | mes | asignados | vistos | aceptados | completados | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | ... | subchannel_desc_Rotiseria/com P Llevar | subchannel_desc_Tienda De Bebidas | subchannel_desc_Verduleria | subchannel_desc_Vinoteca | latas_cat_100.0 | latas_cat_rest | retornable_cat_100.0 | retornable_cat_rest | premium_cat_100.0 | premium_cat_rest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202975 | 1659312000000000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1935.0 | 1.0 | 2124.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 2058 | 663511 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 1631.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 9918 | 907959 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 26331.0 | 1.0 | 1832.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 9936 | 908499 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 16673.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 9942 | 908517 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 88000.0 | 1.0 | 19552.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11490 | 1364333 | 1696118400000000000 | 2.0 | 2.0 | 2.0 | 0.0 | 832.0 | 10030.0 | 1.0 | 2022.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 11478 | 1364321 | 1696118400000000000 | 1.0 | 0.0 | 0.0 | 0.0 | 750.0 | 0.0 | 0.0 | 2157.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 1614 | 622797 | 1696118400000000000 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1553.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 11346 | 926101 | 1696118400000000000 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4198.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 16363 | 1376219 | 1696118400000000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
16364 rows × 46 columns
Para evitar hacerlo demasiado complejo dado el poco tiempo que hay para resolver el ejercicio, no se utilizarán variables con lag ni promedios simples en el modelo. Pero se me ocurrió un posible codigo el cual dejo a continuacion.
df["mes"].unique()
array([1659312000000000000, 1661990400000000000, 1664582400000000000,
1667260800000000000, 1669852800000000000, 1672531200000000000,
1675209600000000000, 1677628800000000000, 1680307200000000000,
1682899200000000000, 1685577600000000000, 1688169600000000000,
1690848000000000000, 1693526400000000000, 1696118400000000000])
# alldfs=[]
# for index, i in enumerate(df["mes"].unique()):
# selected_months=df["mes"].unique()[:index]
# # print(selected_months,i)
# alldfs.append(df[df["mes"].isin(selected_months)].groupby("cliente_id",as_index=False).mean().assign(mes=i))
# to_merge=pd.concat(alldfs)
# to_merge
# alldfs=[]
# for index, i in enumerate(df["mes"].unique()):
# selected_months=df["mes"].unique()[:index]
# selected_months=[selected_months[-1]] if len(selected_months)>=1 else []
# # print(selected_months,i)
# alldfs.append(df[df["mes"].isin(selected_months)].groupby("cliente_id",as_index=False).mean().assign(mes=i))
# to_merge2=pd.concat(alldfs)
# to_merge2
# to_merge=to_merge[[i for i in to_merge.columns if "average_unitil_date" not in i]]
# to_merge
# to_merge.columns=['cliente_id']+[i+"_average_unitil_date" for i in [ 'asignados', 'vistos', 'aceptados', 'completados',
# 'puntos_ganados_total', 'puntos_canjeados_total', 'cantidad_canjes',
# 'puntos_saldo_fin_mes', 'transacciones', 'volumen', 'bultos',
# 'productos_unicos', 'marcas_unicas', 'mix_latas', 'mix_retornable',
# 'mix_premium', 'bultos_por_transaccion', 'ln_volumen', 'ln_bultos',
# 'ln_productos_unicos', 'ln_bultos_por_transaccion', 'start_feb_mar_ab',]]+['mes']
# to_merge2.columns=['cliente_id']+[i+"_lagged" for i in [ 'asignados', 'vistos', 'aceptados', 'completados',
# 'puntos_ganados_total', 'puntos_canjeados_total', 'cantidad_canjes',
# 'puntos_saldo_fin_mes', 'transacciones', 'volumen', 'bultos',
# 'productos_unicos', 'marcas_unicas', 'mix_latas', 'mix_retornable',
# 'mix_premium', 'bultos_por_transaccion', 'ln_volumen', 'ln_bultos',
# 'ln_productos_unicos', 'ln_bultos_por_transaccion', 'start_feb_mar_ab',]]+['mes']
# df=df.merge(to_merge,on=["cliente_id","mes"],how="outer")
# df=df.merge(to_merge2,on=["cliente_id","mes"],how="outer")
back_df=df.copy()
len(df.columns)
46
df.columns
Index(['cliente_id', 'mes', 'asignados', 'vistos', 'aceptados', 'completados',
'puntos_ganados_total', 'puntos_canjeados_total', 'cantidad_canjes',
'puntos_saldo_fin_mes', 'transacciones', 'volumen', 'bultos',
'productos_unicos', 'marcas_unicas', 'mix_latas', 'mix_retornable',
'mix_premium', 'bultos_por_transaccion', 'bees_start', 'rewards_start',
'ln_volumen', 'ln_bultos_por_transaccion', 'start_feb_mar_ab',
'canal_desc_Kioscos/maxikioscos', 'canal_desc_Tradicional',
'subchannel_desc_Almacen Autoservice',
'subchannel_desc_Autoservicio Independiente',
'subchannel_desc_Carniceria/polleria', 'subchannel_desc_Dietetica',
'subchannel_desc_Farmacia', 'subchannel_desc_Fiambreria/queseria',
'subchannel_desc_Kiosco Ventana', 'subchannel_desc_Kiosco/maxikiosco',
'subchannel_desc_Otros K+t', 'subchannel_desc_Panaderia / Pasteleria',
'subchannel_desc_Rotiseria/com P Llevar',
'subchannel_desc_Tienda De Bebidas', 'subchannel_desc_Verduleria',
'subchannel_desc_Vinoteca', 'latas_cat_100.0', 'latas_cat_rest',
'retornable_cat_100.0', 'retornable_cat_rest', 'premium_cat_100.0',
'premium_cat_rest'],
dtype='object')
df.loc[df["cliente_id"]==202975,["mes"]+[i for i in df.columns if "asignados" in i]]
| mes | asignados | |
|---|---|---|
| 0 | 1659312000000000000 | 1.0 |
| 13142 | 1661990400000000000 | NaN |
| 1 | 1664582400000000000 | 1.0 |
| 2 | 1667260800000000000 | 3.0 |
| 3 | 1669852800000000000 | 1.0 |
| 4 | 1672531200000000000 | 1.0 |
| 5 | 1675209600000000000 | 2.0 |
| 6 | 1677628800000000000 | 1.0 |
| 7 | 1680307200000000000 | 2.0 |
| 8 | 1682899200000000000 | 1.0 |
| 9 | 1685577600000000000 | 4.0 |
| 10 | 1688169600000000000 | 1.0 |
| 11 | 1690848000000000000 | 1.0 |
| 12 | 1693526400000000000 | 2.0 |
| 13 | 1696118400000000000 | 1.0 |
# df=df.rename(columns={i:"_".join(i.split("_")[:-1]) for i in df.columns if "average_unitil_date_y" in i})
#blocks
id_cols=['cliente_id', 'mes','canal_desc_Kioscos/maxikioscos', 'canal_desc_Tradicional']
desition_col=['asignados',]
leak_cols=['vistos', 'aceptados', 'completados',] #leak
puntos_cols=['puntos_canjeados_total', 'cantidad_canjes','puntos_saldo_fin_mes',]
transacciones_cols=['transacciones', 'ln_bultos_por_transaccion', 'ln_volumen','productos_unicos',
'marcas_unicas', 'mix_latas', 'mix_retornable','mix_premium',]
cat_cols=['rewards_start', 'start_feb_mar_ab', 'subchannel_desc_Almacen Autoservice',
'subchannel_desc_Autoservicio Independiente',
'subchannel_desc_Carniceria/polleria', 'subchannel_desc_Dietetica',
'subchannel_desc_Farmacia', 'subchannel_desc_Fiambreria/queseria',
'subchannel_desc_Kiosco Ventana', 'subchannel_desc_Kiosco/maxikiosco', 'latas_cat_100.0', 'latas_cat_rest',
'retornable_cat_100.0', 'retornable_cat_rest', 'premium_cat_100.0',
'premium_cat_rest']
# 'subchannel_desc_Otros K+t', 'subchannel_desc_Panaderia / Pasteleria',
# 'subchannel_desc_Rotiseria/com P Llevar',
# 'subchannel_desc_Tienda De Bebidas', 'subchannel_desc_Verduleria',
# 'subchannel_desc_Vinoteca'
df
| cliente_id | mes | asignados | vistos | aceptados | completados | puntos_ganados_total | puntos_canjeados_total | cantidad_canjes | puntos_saldo_fin_mes | ... | subchannel_desc_Rotiseria/com P Llevar | subchannel_desc_Tienda De Bebidas | subchannel_desc_Verduleria | subchannel_desc_Vinoteca | latas_cat_100.0 | latas_cat_rest | retornable_cat_100.0 | retornable_cat_rest | premium_cat_100.0 | premium_cat_rest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202975 | 1659312000000000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1935.0 | 1.0 | 2124.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 2058 | 663511 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 1631.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 9918 | 907959 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 26331.0 | 1.0 | 1832.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 9936 | 908499 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 616.0 | 0.0 | 0.0 | 16673.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 9942 | 908517 | 1659312000000000000 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 88000.0 | 1.0 | 19552.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11490 | 1364333 | 1696118400000000000 | 2.0 | 2.0 | 2.0 | 0.0 | 832.0 | 10030.0 | 1.0 | 2022.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 11478 | 1364321 | 1696118400000000000 | 1.0 | 0.0 | 0.0 | 0.0 | 750.0 | 0.0 | 0.0 | 2157.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 1614 | 622797 | 1696118400000000000 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1553.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 11346 | 926101 | 1696118400000000000 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4198.0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 16363 | 1376219 | 1696118400000000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
16364 rows × 46 columns
x_cols=id_cols+desition_col+puntos_cols+transacciones_cols+cat_cols
# x_cols=x_cols+[i+"_average_unitil_date" for i in x_cols+leak_cols]
# x_cols=[i for i in x_cols if i not in [
# 'cliente_id_average_unitil_date', 'mes_average_unitil_date', 'latas_cat_average_unitil_date',
# 'retornable_cat_average_unitil_date', 'premium_cat_average_unitil_date', 'canal_desc_average_unitil_date',
# 'subchannel_desc_average_unitil_date', 'rewards_start_average_unitil_date']]
# x_cols=x_cols[2:]
x_cols
['cliente_id', 'mes', 'canal_desc_Kioscos/maxikioscos', 'canal_desc_Tradicional', 'asignados', 'puntos_canjeados_total', 'cantidad_canjes', 'puntos_saldo_fin_mes', 'transacciones', 'ln_bultos_por_transaccion', 'ln_volumen', 'productos_unicos', 'marcas_unicas', 'mix_latas', 'mix_retornable', 'mix_premium', 'rewards_start', 'start_feb_mar_ab', 'subchannel_desc_Almacen Autoservice', 'subchannel_desc_Autoservicio Independiente', 'subchannel_desc_Carniceria/polleria', 'subchannel_desc_Dietetica', 'subchannel_desc_Farmacia', 'subchannel_desc_Fiambreria/queseria', 'subchannel_desc_Kiosco Ventana', 'subchannel_desc_Kiosco/maxikiosco', 'latas_cat_100.0', 'latas_cat_rest', 'retornable_cat_100.0', 'retornable_cat_rest', 'premium_cat_100.0', 'premium_cat_rest']
y_col='completados'
df.columns
Index(['cliente_id', 'mes', 'asignados', 'vistos', 'aceptados', 'completados',
'puntos_ganados_total', 'puntos_canjeados_total', 'cantidad_canjes',
'puntos_saldo_fin_mes', 'transacciones', 'volumen', 'bultos',
'productos_unicos', 'marcas_unicas', 'mix_latas', 'mix_retornable',
'mix_premium', 'bultos_por_transaccion', 'bees_start', 'rewards_start',
'ln_volumen', 'ln_bultos_por_transaccion', 'start_feb_mar_ab',
'canal_desc_Kioscos/maxikioscos', 'canal_desc_Tradicional',
'subchannel_desc_Almacen Autoservice',
'subchannel_desc_Autoservicio Independiente',
'subchannel_desc_Carniceria/polleria', 'subchannel_desc_Dietetica',
'subchannel_desc_Farmacia', 'subchannel_desc_Fiambreria/queseria',
'subchannel_desc_Kiosco Ventana', 'subchannel_desc_Kiosco/maxikiosco',
'subchannel_desc_Otros K+t', 'subchannel_desc_Panaderia / Pasteleria',
'subchannel_desc_Rotiseria/com P Llevar',
'subchannel_desc_Tienda De Bebidas', 'subchannel_desc_Verduleria',
'subchannel_desc_Vinoteca', 'latas_cat_100.0', 'latas_cat_rest',
'retornable_cat_100.0', 'retornable_cat_rest', 'premium_cat_100.0',
'premium_cat_rest'],
dtype='object')
Ahora, el tratamiento de los nans puede ser variado. Se puede eliminar las columnas, que es lo que haré por el poco tiempo que hay para hacer el proyecto, se pueden imputar valores con la media, mediana o moda de la columna, o incluso algoritmos de machine learning.
# eliminamos las filas con nulls para poder correr el modelo
df_clean = df.dropna()
X_clean = df_clean[x_cols].to_numpy()
y_clean = df_clean[y_col].to_numpy()
X=df[x_cols].to_numpy()
y=df[y_col].to_numpy()
Ahora chequeamos que las dimensiones de ambos X e y sean las mismas para poder entrenar y testear el modelo.
print(X_clean.shape)
print(y_clean.shape)
(9466, 32) (9466,)
num_columnas = X.shape[1]
for i in range(num_columnas):
print("Columna", i+1)
Columna 1 Columna 2 Columna 3 Columna 4 Columna 5 Columna 6 Columna 7 Columna 8 Columna 9 Columna 10 Columna 11 Columna 12 Columna 13 Columna 14 Columna 15 Columna 16 Columna 17 Columna 18 Columna 19 Columna 20 Columna 21 Columna 22 Columna 23 Columna 24 Columna 25 Columna 26 Columna 27 Columna 28 Columna 29 Columna 30 Columna 31 Columna 32
Chequeamos que todos los valores sean numericos
df.dtypes
cliente_id int64 mes int64 asignados float64 vistos float64 aceptados float64 completados float64 puntos_ganados_total float64 puntos_canjeados_total float64 cantidad_canjes float64 puntos_saldo_fin_mes float64 transacciones float64 volumen float64 bultos float64 productos_unicos float64 marcas_unicas float64 mix_latas float64 mix_retornable float64 mix_premium float64 bultos_por_transaccion float64 bees_start int64 rewards_start int64 ln_volumen float64 ln_bultos_por_transaccion float64 start_feb_mar_ab int64 canal_desc_Kioscos/maxikioscos uint8 canal_desc_Tradicional uint8 subchannel_desc_Almacen Autoservice uint8 subchannel_desc_Autoservicio Independiente uint8 subchannel_desc_Carniceria/polleria uint8 subchannel_desc_Dietetica uint8 subchannel_desc_Farmacia uint8 subchannel_desc_Fiambreria/queseria uint8 subchannel_desc_Kiosco Ventana uint8 subchannel_desc_Kiosco/maxikiosco uint8 subchannel_desc_Otros K+t uint8 subchannel_desc_Panaderia / Pasteleria uint8 subchannel_desc_Rotiseria/com P Llevar uint8 subchannel_desc_Tienda De Bebidas uint8 subchannel_desc_Verduleria uint8 subchannel_desc_Vinoteca uint8 latas_cat_100.0 uint8 latas_cat_rest uint8 retornable_cat_100.0 uint8 retornable_cat_rest uint8 premium_cat_100.0 uint8 premium_cat_rest uint8 dtype: object
df['completados'].value_counts()
0.0 11584 1.0 1285 2.0 235 3.0 38 Name: completados, dtype: int64
Tambien, queremos que los valores de "completados" sean 1 y 0. Porque pide el enunciado que se complete AL MENOS un desafio.
# np.unique(y_clean)
# y = np.where(y_clean == 0, 0, 1)
np.unique(y_clean)
y_clean = np.where(y_clean == 0, 0, 1)
En primer lugar, tenemos que hacer el split:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, stratify=y_clean, random_state=8)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(7572, 32) (7572,) (1894, 32) (1894,)
Elegí el modelo Random Forest ya que es optimo para clasificaciones. Consideré tambien XGBoost y modelo de regresion logistica, los cuales probaré luego para comparar.
La idea principal es setear los hiperparametros adecuados (n_estimators y min_samples_leaf), para eso los testeare de la siguiente manera:
val_error=[]
for i in range(20):
rf=RandomForestClassifier(n_estimators=100,min_samples_leaf=i+1) #probar con 100 y 1000
rf.fit(X_train,y_train)
yvalpred=rf.predict(X_test)
error=mean_squared_error(y_test,yvalpred)
val_error.append(error)
px.line(x=[i+1 for i in range(20)],y=val_error).show()
min_error = min(val_error)
min_error_index = val_error.index(min_error)
best_min_samples_leaf = min_error_index + 1
print("El mejor valor de min_samples_leaf es:", best_min_samples_leaf)
El mejor valor de min_samples_leaf es: 4
param_grid = {
'n_estimators': [100, 500, 1000],
'min_samples_leaf': [1, 5, 10]
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
# Definimos los mejores hiperparametros
best_params = grid_search.best_params_
print("Mejores hiperparámetros:", best_params)
# mejor modeelo
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Exactitud del mejor modelo:", accuracy)
Mejores hiperparámetros: {'min_samples_leaf': 1, 'n_estimators': 500}
Exactitud del mejor modelo: 0.8643083421330517
Corremos el modelo con el resultado que nos dio:
rf = RandomForestClassifier(n_estimators=1000, min_samples_leaf=10)
rf.fit(X_train, y_train)
yvalpred = rf.predict(X_test)
error = mean_squared_error(y_test, yvalpred)
accuracy = accuracy_score(y_test, yvalpred)
precision = precision_score(y_test, yvalpred, average='weighted')
recall = recall_score(y_test, yvalpred, average='weighted')
f1 = f1_score(y_test, yvalpred, average='weighted')
conf_matrix = confusion_matrix(y_test, yvalpred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Matriz de Confusión:")
print(conf_matrix)
Accuracy: 0.8611404435058078 Precision: 0.8386803743464342 Recall: 0.8611404435058078 F1 Score: 0.8201307292306208 Matriz de Confusión: [[1591 19] [ 244 40]]
# Utilizamos el x_test para calcular las probabilidades
probabilidades = rf.predict_proba(X_test)
probabilidades_completado = probabilidades[:, 1]
print(probabilidades_completado)
[0.14604726 0.15075584 0.09121142 ... 0.18393944 0.08871448 0.05624337]
# probabilidades de completar al menos un desafío (de la mayor a la menor)
indices_ordenados = np.argsort(probabilidades_completado)[::-1]
top_150_clientes_indices = indices_ordenados[:150]
# obtenemos ids
top_150_clientes_ids = X_test[top_150_clientes_indices, 0]
print(top_150_clientes_ids)
[ 856049. 857409. 830791. 873427. 826807. 813409. 873427. 777915. 879347. 838699. 857409. 834793. 893717. 890929. 844227. 900401. 732439. 870449. 871221. 778705. 713751. 903717. 862211. 844693. 862451. 760911. 847255. 870307. 889673. 314887. 913791. 917217. 864285. 871221. 840857. 900405. 929683. 235937. 900401. 739129. 834953. 890929. 882181. 627157. 745627. 335371. 814023. 910787. 864291. 885299. 278997. 235937. 840857. 848601. 667939. 762171. 889569. 762845. 857841. 1364333. 855159. 861957. 828771. 830787. 850561. 888977. 250995. 908517. 838697. 868171. 859677. 876937. 874251. 794543. 865697. 902949. 794507. 1365553. 897531. 913791. 876937. 782607. 862211. 732443. 218421. 878187. 800049. 351345. 610085. 796543. 820873. 920261. 885411. 914699. 909227. 820873. 758305. 914699. 833497. 732443. 846523. 875177. 1365607. 870449. 314887. 883399. 819945. 856381. 607683. 669481. 762845. 630397. 1371505. 250995. 870969. 353827. 883399. 741501. 908803. 794301. 862747. 893717. 828771. 872011. 356809. 794507. 897531. 848531. 885853. 1383761. 782721. 857841. 716861. 861377. 758345. 864829. 859677. 905747. 366115. 911823. 875167. 911965. 825255. 867231. 910935. 799811. 889039. 745627. 888977. 874251.]
# exportamos a la ruta del drive en formato csv
#df_top_150_clientes = pd.DataFrame({'cliente_id': top_150_clientes_ids})
#ruta_csv = '/content/drive/MyDrive/Colab Notebooks/Interview/top_150_clientes.csv'
#df_top_150_clientes.to_csv(ruta_csv, index=False)